{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "read_dir='E:/文档/数据集/Data Sets used in Computing Education/ASSISTment Data Sets/ASSISTment Data/2009-2010 ASSISTment Data/Skill-builder data 2009-2010/'\n",
    "save_dir='../data/a0910/'\n",
    "\n",
    "prob_count_limit=15 #每个学生做过的习题下限\n",
    "data_set=pd.read_csv(read_dir+'skill_builder_data_corrected.csv',encoding='ISO-8859-1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "data=data_set.loc[:,['user_id','problem_id','correct','skill_name']]\\\n",
    "    .drop_duplicates(subset=['user_id','problem_id','skill_name'],keep='first').dropna(axis=0,how='any')\n",
    "data.columns=['user_id','item_id','score','knowledge_code']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "record=data.loc[:,['user_id','item_id','score']].drop_duplicates(subset=['user_id','item_id'],keep='first').dropna(axis=0,how='any')\n",
    "item_conc_data=data.loc[:,['item_id','knowledge_code']].drop_duplicates(keep='first').dropna(axis=0,how='any')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "有效的学生数： 2380\n"
     ]
    }
   ],
   "source": [
    "# 统计每个学生做了多少道题\n",
    "problem_counter=record.groupby(by='user_id').count()\n",
    "\n",
    "filtered_stu_boundary=prob_count_limit #学生做的题超过15道才算数\n",
    "\n",
    "filtered_stu_id=problem_counter[problem_counter['item_id']>filtered_stu_boundary].index.to_numpy()\n",
    "\n",
    "print('有效的学生数：',len(filtered_stu_id))\n",
    "\n",
    "record=record.set_index('user_id').loc[filtered_stu_id,:].reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_unique=np.unique(record['item_id'])\n",
    "know_unique=np.unique(item_conc_data['knowledge_code'])\n",
    "stu_unique=np.unique(record['user_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "stu_old_new=dict(zip(stu_unique,range(1,len(stu_unique)+1)))\n",
    "item_old_new=dict(zip(item_unique,range(1,len(item_unique)+1)))\n",
    "know_old_new=dict(zip(know_unique,range(1,len(know_unique)+1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(save_dir+'dict_knowledge_code.json','w') as f:\n",
    "    json.dump(know_old_new,f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done\n"
     ]
    }
   ],
   "source": [
    "record['user_id']=record['user_id'].map(stu_old_new)\n",
    "record['item_id']=record['item_id'].map(item_old_new)\n",
    "\n",
    "item_conc_data['item_id']=item_conc_data['item_id'].map(item_old_new)\n",
    "item_conc_data['knowledge_code']=item_conc_data['knowledge_code'].map(know_old_new)\n",
    "\n",
    "print('Done')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_open_df=item_conc_data.set_index('item_id')\n",
    "\n",
    "item_df=pd.DataFrame(columns=['item_id','knowledge_code'],index=range(1,len(item_unique)+1))\n",
    "for item in range(1,len(item_unique)+1):\n",
    "    item_df.loc[item,'item_id']=item\n",
    "    item_df.loc[item,'knowledge_code']=np.array(item_open_df.loc[item,['knowledge_code']]).reshape(-1).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "item_df.to_csv(save_dir+'item.csv',index=False)\n",
    "record.to_csv(save_dir+'record.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "学习者数： 2380\n",
      "习题数： 16804\n",
      "知识点数： 110\n",
      "记录数： 257585\n"
     ]
    }
   ],
   "source": [
    "print('学习者数：',len(stu_unique))\n",
    "print('习题数：',len(item_unique))\n",
    "print('知识点数：',len(know_unique))\n",
    "print('记录数：',len(record))"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "527a93331b4b1a8345148922acc34427fb7591433d63b66d32040b6fbbc6d593"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 64-bit ('pytorch': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
